# -*- coding:utf-8 -*-
import urllib
import simplejson
import re
import os
import random

def craw_lrc_by_mid(mid):
    """Download the LRC lyrics of a Baidu Music song.

    Fetches the song page ``http://music.baidu.com/song/<mid>``, looks in its
    HTML for the first lyric path of the form ``/data2/lrc/....lrc`` and
    downloads that file, e.g.
    ``http://music.baidu.com/data2/lrc/13762828/13762828.lrc``.

    Args:
        mid: Song id; converted with ``str()`` to build the page URL.

    Returns:
        The raw body of the lyric file as returned by the server, or ``""``
        when the song page contains no lyric link.

    Errors raised by ``urllib.urlopen`` (e.g. network failures) propagate
    to the caller.
    """
    music_url = "http://music.baidu.com/song/" + str(mid)
    page = urllib.urlopen(music_url)
    try:
        html_str = page.read()
    finally:
        # Always release the connection, even if read() fails.
        page.close()

    # The dot before "lrc" is escaped so only a real ".lrc" suffix matches,
    # and the path is restricted to characters that can appear inside an
    # HTML attribute value, so the match cannot run across markup to an
    # unrelated "lrc" further down the page.
    lrcm = re.search(r"""/data2/lrc/[^\s"'<>]*?\.lrc""", html_str)
    if not lrcm:
        return ""

    lrc_url = "http://music.baidu.com" + lrcm.group(0)
    lrc_resp = urllib.urlopen(lrc_url)
    try:
        return lrc_resp.read()
    finally:
        lrc_resp.close()

if __name__ == "__main__":
    # Batch driver: for every "*json" file in ./data, load its list of song
    # records and download the lyrics of each song into ./lyrics/<id>.lrc.
    # Songs whose output file already exists are skipped, so the script can
    # be re-run to resume an interrupted crawl.
    data_dir = "./data"
    lyrics_dir = "./lyrics"

    # Create the output directory up front instead of failing on the first
    # write when it is missing.
    if not os.path.isdir(lyrics_dir):
        os.makedirs(lyrics_dir)

    for json_file in reversed(os.listdir(data_dir)):
        if not json_file.endswith("json"):
            continue
        print(json_file)

        json_path = os.path.join(data_dir, json_file)
        with open(json_path) as json_fp:
            m_list = simplejson.load(json_fp)

        # Each record is expected to carry the song id at m['title']['id'].
        for m in reversed(m_list):
            mid = m['title']['id']
            out_path = os.path.join(lyrics_dir, str(mid) + '.lrc')
            if os.path.isfile(out_path):
                continue
            print(mid)

            # Download BEFORE creating the output file: if the fetch raises,
            # no empty .lrc is left behind to make later runs skip this song.
            lrc_str = craw_lrc_by_mid(mid)
            with open(out_path, 'w') as outf:
                outf.write(lrc_str)
                # Keep the trailing newline the original print>>outf emitted.
                outf.write("\n")

